Rebrickable exploration

Mateusz Tabaszewski, Bartłomiej Pukacki

2023-04-11

Prerequisite libraries:

#install.packages('R.utils', repos = "http://cran.us.r-project.org")
#install.packages("treemapify", repos = "http://cran.us.r-project.org")
library(data.table)
library(tidyr)
library(ggplot2)
library(dplyr)
library(treemapify)
library(ggridges)
library(DT)
library(plotly)
# Load the Rebrickable tables from gzipped CSV files; the paths are relative
# to the current working directory.
colors <- fread("rebrickable/colors.csv.gz")
inventory_parts <- fread("rebrickable/inventory_parts.csv.gz")
inventories <- fread("rebrickable/inventories.csv.gz")
sets <- fread("rebrickable/sets.csv.gz")
themes <- fread("rebrickable/themes.csv.gz")
part_categories <- fread("rebrickable/part_categories.csv.gz")
parts <- fread("rebrickable/parts.csv.gz")
# Shared styling constants used by the themed plots further down:
# colours for the dark background, text and axis ticks.
background_color <- "#2d2d2d"
font_color <- "white"
tick_color <- "#DDDDDD"
# Font sizes for plot titles, axis titles and axis tick labels.
title_size <- 20
label_size <- 16
tick_size <- 12
# Presumably a grid line width; not referenced in the visible part of the file.
grid_size <- 0.5

# NOTE(review): meaning and unit of this value are not evident from this file
# - confirm where it is consumed.
code_chunk_length <- 9.67

# Accent colours for plot geoms.
orange <- "#FF7E67"
# NOTE(review): "#FFFFC0" is a pale yellow rather than a coral tone - confirm
# that the name and the value are both intended.
coral <- "#FFFFC0"
teal <- "#40C0C0"

Themes


In this section we will explore data regarding themes.

# Top-level themes: the rows of `themes` that have no parent.
parents <- themes %>%
  filter(is.na(parent_id)) %>%
  group_by(id, name)

# Find the name of the top-level ancestor reached from `parent_id`.
#
# parent_id: a single theme id, or NA when the theme has no parent.
#
# Returns the sentinel string "parent" when `parent_id` is NA; otherwise the
# `parents$name` entry of the last id in the chain of `themes$parent_id`
# look-ups, i.e. the first ancestor whose own parent is NA.
find_parent <- function(parent_id) {
  if (is.na(parent_id)) {
    return("parent")
  }

  # Climb one level at a time until the next ancestor is NA.
  root_id <- parent_id
  next_id <- themes$parent_id[themes$id == root_id]
  while (!is.na(next_id)) {
    root_id <- next_id
    next_id <- themes$parent_id[themes$id == root_id]
  }

  parents$name[parents$id == root_id]
}

# Attach theme metadata to every set. The `name` column of each table is
# renamed first so the two do not collide in the merge.
set_themes_children <- rename(sets, set_name = name) %>%
  merge(rename(themes, theme_name = name), by.x = "theme_id", by.y = "id")

# Resolve the top-level ancestor of each set's theme. vapply() yields a plain
# character vector (lapply() stored a list column here) and stops with an
# error if a look-up does not return exactly one name.
set_themes_children$parent_name <- vapply(
  set_themes_children$parent_id,
  find_parent,
  character(1),
  USE.NAMES = FALSE
)

# find_parent() returns the sentinel "parent" for themes without a parent;
# those rows keep their own theme name as the parent name.
set_themes_children <- set_themes_children %>%
  mutate(parent_name = ifelse(parent_name == "parent", theme_name, parent_name))

Top 5 cumsum

# Attach theme metadata to every set. Sets without a matching theme are kept
# (all.x = TRUE). After the merge `name.x` is the set name and `name.y` the
# theme name.
merged_df <- merge(sets, themes, by.x = "theme_id", by.y = "id", all.x = TRUE)

# Tick positions for the year axis.
selected_years <- c(1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020)

# Names of the (up to) five themes with the most sets. Taking the column of
# the head() result directly avoids padding with NA when fewer than five
# themes exist, which would otherwise select sets that have no theme at all.
popular_themes <- merged_df %>%
  count(name.y, sort = TRUE) %>%
  head(5)
popular_themes <- popular_themes$name.y

merged_df_filtered <- merged_df %>%
  filter(name.y %in% popular_themes)

# Number of sets per theme and year. `.groups = "drop"` returns an ungrouped
# result and silences the grouping message from summarise().
theme_year_counts <- merged_df_filtered %>%
  group_by(name.y, year) %>%
  summarise(num_sets = n(), .groups = "drop")

# Running total per theme. The explicit sort makes the cumulative sum
# independent of the incoming row order.
theme_year_counts_cumsum <- theme_year_counts %>%
  arrange(name.y, year) %>%
  group_by(name.y) %>%
  mutate(cum_sum = cumsum(num_sets)) %>%
  ungroup()

ggplot(data = theme_year_counts_cumsum, aes(x = year, y = cum_sum, color = name.y)) +
  geom_line(size = 1) +
  scale_x_continuous(breaks = selected_years) +
  labs(x = "Year", y = "Number of Sets", color = "Theme") +
  ggtitle("Cumulative Sum of Sets for Chosen Themes")

Star Wars

# Yearly set counts restricted to the "Star Wars" theme.
star_wars_counts <- filter(theme_year_counts, name.y == "Star Wars")

# Release year, marker colour and title of each film; the three vectors are
# aligned by position.
film_releases <- c(1999, 2002, 2005, 2015, 2017, 2019)
film_colors <- c("red", "orange", "green", "blue", "purple", "brown")
film_labels <- c(
  "The Phantom Menace", "Attack of the Clones", "Revenge of the Sith",
  "The Force Awakens", "The Last Jedi", "The Rise of Skywalker"
)

ggplot(star_wars_counts, aes(x = year, y = num_sets)) +
  geom_line(size = 1) +
  scale_x_continuous(breaks = selected_years) +
  labs(x = "Year", y = "Number of Sets") +
  ggtitle("Number of Produced Star Wars Sets") +
  # One dotted vertical marker per film release.
  geom_vline(
    xintercept = film_releases,
    color = film_colors,
    linetype = "dotted"
  ) +
  # Labels for the first three films are anchored at 40% of the largest
  # yearly count.
  annotate(
    "text",
    x = film_releases[1:3] + 0.2,
    y = max(star_wars_counts$num_sets) * 0.4,
    label = film_labels[1:3],
    color = film_colors[1:3],
    angle = 90,
    hjust = -0.2,
    size = 3
  ) +
  # Labels for the last three films are anchored at y = 0.
  annotate(
    "text",
    x = film_releases[4:6] + 0.2,
    y = 0,
    label = film_labels[4:6],
    color = film_colors[4:6],
    angle = 90,
    hjust = -0.2,
    size = 3
  ) +
  theme(legend.position = "none")


Themes ~ sets


Table of parent themes

# Sets joined with their theme; both `name` columns are renamed up front so
# they stay distinguishable after the merge.
set_themes <- merge(
  rename(sets, set_name = name),
  rename(themes, theme_name = name),
  by.x = "theme_id",
  by.y = "id"
)

# Inventories of those sets.
set_theme_inventories <- merge(inventories, set_themes, by = "set_num")

# Parts contained in those inventories. This join needs `inventory_parts` to
# still carry its `inventory_id` column.
set_theme_inventories_parts <- merge(
  inventory_parts,
  set_theme_inventories,
  by.x = "inventory_id",
  by.y = "id"
)

# Colour details for every part row; the join key comes back as `id` and is
# renamed to `color_id` afterwards.
set_theme_inventories_parts_colors <- merge(
  rename(colors, color_name = name),
  set_theme_inventories_parts,
  by.x = "id",
  by.y = "color_id"
) %>%
  rename(color_id = id)

# Keep only rows whose theme has no parent, and only the colour/theme columns.
kept_columns <- c("color_name", "rgb", "theme_name", "theme_id", "color_id")
theme_colors <- set_theme_inventories_parts_colors %>%
  filter(is.na(parent_id)) %>%
  select(one_of(kept_columns))

# Number of part rows per theme and colour, largest counts first within each
# theme. The result stays grouped (.groups = "keep").
grouped_theme_colors <- theme_colors %>%
  group_by(theme_name, color_name, color_id, rgb) %>%
  summarise(count = n(), .groups = "keep") %>%
  arrange(theme_name, desc(count))

# Render a data frame as an interactive DT table with per-column filters and
# export buttons, rounding selected columns.
#
# table_df:           data frame to display.
# round_columns_func: predicate applied to every column of `table_df`; the
#                     columns for which it returns TRUE are rounded.
# round_digits:       number of decimal places passed to formatRound().
#
# Returns the datatable widget produced by formatRound().
prettyTable <- function(table_df, round_columns_func=is.numeric, round_digits=0) {
  export_buttons <- c("copy", "csv", "excel", "pdf", "print")

  widget <- DT::datatable(
    table_df,
    style = "bootstrap",
    filter = "top",
    rownames = FALSE,
    extensions = "Buttons",
    options = list(dom = "Bfrtip", buttons = export_buttons)
  )

  # One logical flag per column marking the columns to round.
  columns_to_round <- unlist(lapply(table_df, round_columns_func))

  formatRound(widget, columns_to_round, round_digits)
}

# Show the per-theme colour counts as an interactive table.
prettyTable(grouped_theme_colors)

Parts


# Sets joined with their theme (sets without a matching theme are kept), then
# one row per inventory carrying the set and theme columns.
merged_df <- merge(sets, themes, by.x = "theme_id", by.y = "id", all.x = TRUE)
merged_df_inventories <- merge(inventories, merged_df, by = "set_num", all.x = TRUE)

# Align the key names used by the joins below. any_of() turns each rename
# into a no-op when the source column is already gone, so this chunk can be
# re-run without failing.
inventory_parts <- inventory_parts %>%
  rename(any_of(c(id = "inventory_id")))
# Fix: this previously renamed a copy of `inventory_parts`, which replaced the
# real part-category table.
part_categories <- part_categories %>%
  rename(any_of(c(part_cat_id = "id")))

# Every part row of every inventory, with set and theme columns attached.
merged_df_inventory_parts <- merge(merged_df_inventories, inventory_parts, by = "id")

# Add the part details.
merged_df_inventory_parts_final <- merge(merged_df_inventory_parts, parts, by = "part_num")

# `count` is the number of rows in which each part number occurs; it is
# attached back to every row so later chunks can filter on it.
parts_count <- merged_df_inventory_parts_final %>% count(part_num, name = "count")
merged_counts <- merge(
  merged_df_inventory_parts_final,
  parts_count,
  by = "part_num",
  all.x = TRUE
)

Unique parts in years

# Part numbers that occur in exactly one row of the merged table, tallied by
# year; rows containing missing values are removed.
result <- merged_counts %>%
  filter(count == 1) %>%
  count(year) %>%
  na.omit()

ggplot(data = result, mapping = aes(x = year, y = n)) +
  geom_line() +
  ggtitle("Number of Unique Lego Blocks per Year") +
  labs(x = "Year", y = "Number of Unique Lego Parts")

Unique cumsum

# Running total of the yearly counts, accumulated in chronological order.
result_cumsum <- mutate(
  arrange(result, year),
  n_cumsum = cumsum(n)
)

ggplot(data = result_cumsum, mapping = aes(x = year, y = n_cumsum)) +
  geom_line() +
  ggtitle("Cumulative Sum of Rare Lego Parts through Years") +
  labs(x = "Year", y = "Number of Rare Lego Parts")

Unique vs average in a theme

result <- merged_counts

# Per theme: mean of `num_parts` and sum of `count` over all merged rows,
# keeping the 100 themes with the largest sum.
# NOTE(review): `count` is the number of rows in which a part number occurs,
# so sum(count) is not obviously the "number of rare parts" the labels claim,
# and mean(num_parts) is taken over part rows rather than over sets - confirm
# the intended metrics.
theme_counts <- result %>%
  group_by(name.y) %>%
  summarize(num_parts = mean(num_parts), count = sum(count)) %>%
  arrange(desc(count)) %>%
  head(100) %>%
  rename(
    `Average Number of Parts` = num_parts,
    `Number of Rare Lego Parts` = count
  )

# Columns are referenced by name inside aes() instead of through
# `theme_counts$...`, so the mapping is evaluated against the plot data.
# `text` supplies the theme name to the interactive plot.
ggplotly(
  ggplot(
    theme_counts,
    aes(
      x = `Average Number of Parts`,
      y = `Number of Rare Lego Parts`,
      text = name.y
    )
  ) +
    geom_point() +
    labs(x = "Average Number of Parts", y = "Number of Rare Lego Parts") +
    ggtitle("Number of Rare Lego Parts vs Average Number of Parts for a Theme")
)

Average parts per set

# Sets joined with their theme; sets without a matching theme are kept.
merged_df <- merge(
  x = sets,
  y = themes,
  by.x = "theme_id",
  by.y = "id",
  all.x = TRUE
)

# Mean number of parts over all sets released in each year.
avg_parts_per_year <- summarize(
  group_by(merged_df, year),
  avg_parts = mean(num_parts)
)

ggplot(avg_parts_per_year, aes(x = year, y = avg_parts)) +
  geom_line(size = 1) +
  ggtitle("Average Number of Parts per Set per Year") +
  labs(x = "Year", y = "Average Number of Parts per Set")


Parts ~ themes


20 themes with the highest number of unique LEGO parts

# Number of single-occurrence parts per theme, largest first. Rows with
# missing values and the "Database Sets" theme are excluded before taking
# the top 20.
result_2 <- merged_counts %>%
  filter(count == 1) %>%
  count(name.y) %>%
  na.omit() %>%
  arrange(desc(n)) %>%
  filter(name.y != "Database Sets") %>%
  head(20)

# Horizontal bars ordered by count, drawn with the shared dark styling.
ggplot(result_2, aes(x = n, y = reorder(name.y, n))) +
  geom_col(fill = coral) +
  labs(
    title = "20 Themes with the Highest Amount of Unique LEGO Parts",
    x = "Number of Unique Parts",
    y = "Theme"
  ) +
  theme(
    plot.background = element_rect(fill = background_color),
    plot.title = element_text(size = title_size, colour = font_color, hjust = 4),
    axis.title.x = element_text(size = label_size, colour = font_color, hjust = 0.2),
    axis.title.y = element_text(size = label_size, colour = font_color),
    axis.text = element_text(size = tick_size, color = tick_color),
    panel.grid = element_line(color = "#DDDDDD"),
    panel.background = element_rect(fill = background_color)
  )

Average and maximum parts per theme

# Mean and maximum number of parts over the sets of each theme, computed in
# a single pass.
theme_part_counts <- merged_df %>%
  group_by(name.y) %>%
  summarize(
    avg_parts = mean(num_parts),
    max_parts = max(num_parts)
  )

# The 20 themes ranked highest by each statistic.
top_20_avg_parts <- theme_part_counts %>%
  arrange(desc(avg_parts)) %>%
  head(20)
top_20_max_parts <- theme_part_counts %>%
  arrange(desc(max_parts)) %>%
  head(20)

# Bars show the average; the dot on each row marks the maximum for that theme.
ggplot(top_20_avg_parts, aes(x = reorder(name.y, avg_parts), y = avg_parts)) +
  geom_col(fill = coral) +
  coord_flip() +
  labs(x = "Theme", y = "Average Number of Parts") +
  geom_point(
    aes(reorder(name.y, avg_parts), y = max_parts),
    color = background_color,
    fill = coral,
    shape = 21,
    size = 3,
    stroke = 1.5
  ) +
  ggtitle("Average and Maximum Number of Parts in Selected Themes") +
  theme(
    plot.background = element_rect(fill = background_color),
    plot.title = element_text(size = title_size, colour = font_color, hjust = 4),
    axis.title.x = element_text(size = label_size, colour = font_color, hjust = 0.2),
    axis.title.y = element_text(size = label_size, colour = font_color),
    axis.text = element_text(size = tick_size, color = tick_color),
    panel.grid = element_line(color = "#DDDDDD"),
    panel.background = element_rect(fill = background_color)
  )


Sets


Number of sets over years

# Count the sets released in each year and turn the table into a two-column
# data frame (year, num_sets).
# NOTE(review): as.data.frame() on a table stores `year` as a factor, which is
# why the plot below uses a discrete x scale and `group = 1`.
num_sets_per_year <- table(sets$year)
num_sets_per_year_df <- as.data.frame(num_sets_per_year)
colnames(num_sets_per_year_df) <- c("year", "num_sets")

# Years to label on the x axis.
selected_years <- c(1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020)

# Drop the last row (the most recent year) - presumably because that year is
# incomplete in the data; confirm.
num_sets_per_year_df_new <- head(num_sets_per_year_df,-1)

# Log-linear fit of the yearly counts, used as an exponential trend.
# NOTE(review): as.integer() on a factor returns the level codes (1, 2, ...),
# not the calendar years, so the model is fitted against the position of each
# year in the table. The discrete x scale below also places years by position,
# which keeps the overlay aligned - confirm this is intended, since years
# missing from the data are not represented as gaps.
exp_model <- lm(log(num_sets_per_year_df_new$num_sets)~ as.integer(num_sets_per_year_df_new$year))

ggplot(num_sets_per_year_df_new, aes(x = year, y = num_sets, group=1)) +
  geom_line(color = teal, size=1) +
  # Trend line: a linear fit of y on the exponential curve built from the
  # coefficients of `exp_model`.
  geom_smooth(method = "lm", formula = y ~ exp(coef(exp_model)[1] + coef(exp_model)[2] * x), 
              se = FALSE, color=orange) +
  # Only label the selected years on the discrete axis.
  scale_x_discrete(breaks = selected_years) +
  xlab("Year") +
  ylab("Number of Sets") +
  ggtitle("Number of Sets per Year") +
  # Shared dark styling.
  theme(plot.background = element_rect(fill=background_color),
        plot.title=element_text(size=title_size, colour = font_color),
        axis.title.x = element_text(size=label_size, colour = font_color),
        axis.title.y = element_text(size=label_size, color = font_color),
        axis.text.x = element_text(size=tick_size, color = tick_color),
        axis.text.y = element_text(size=tick_size, color = tick_color),
        panel.background = element_rect(fill=background_color))


Colors


In the next section we will explore the colors data.

Transparency

In the figure we can observe that the most commonly used colors are different shades of grey. Characteristic tones of yellow, blue and red are also present in the treemap.

# Number of colours per transparency flag, with a percentage label and a
# readable Yes/No flag ('f' in `is_trans` maps to "No").
trans_count <- colors %>%
  group_by(is_trans) %>%
  summarise(count = n()) %>%
  mutate(
    proportions = paste0(round(count / sum(count), 2) * 100, "%"),
    is_transparent = ifelse(is_trans == "f", "No", "Yes")
  )

# Pie chart without a legend: the slices are labelled directly by the text
# annotations.
ggplot(
  trans_count,
  aes(x = "", y = count, fill = is_transparent, label = proportions)
) +
  geom_col(width = 1) +
  # Percentage labels, nudged individually into their slices.
  geom_text(
    nudge_y = c(-45, -19),
    nudge_x = c(0.0025, 0.2),
    size = 9,
    color = font_color,
    fontface = "bold"
  ) +
  # Slice captions placed outside the pie.
  annotate(geom = "text", x = 1.9, y = 20, label = "Transparent", size = 7, color = font_color) +
  annotate(geom = "text", x = 1.9, y = 125, label = "Non-transparent", size = 7, color = font_color) +
  coord_polar("y", start = 0) +
  ggtitle("Proportion of transparent colors") +
  theme_void() +
  theme(
    plot.title = element_text(hjust = 0.3, vjust = -1.5, color = font_color, size = title_size),
    plot.background = element_rect(fill = background_color),
    axis.ticks.length = unit(0, "pt"),
    legend.position = "none"
  )